%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the nuclear protein-expression dataset and print per-class statistics.
protein = pd.read_csv("nuclear.csv")

# Mean and standard deviation of every numeric column, grouped by class.
grouped = protein.groupby(['class'])
print('Mean(cm): \n', grouped.mean())
print('-'*80,'\n')
print('Stdev(cm): \n', grouped.std())
print('-'*80)
# Partition the protein columns into groups of (roughly) ten.
# NOTE(review): the first slice (1:18) overlaps the second (11:21) —
# presumably intentional so the first group also carries columns such as
# pNR1_N that the relation plots below rely on; confirm against nuclear.csv.
proteinData = [
    protein.iloc[:, 1:18],
    protein.iloc[:, 11:21],
    protein.iloc[:, 21:31],
    protein.iloc[:, 31:41],
    protein.iloc[:, 41:51],
    protein.iloc[:, 51:61],
    protein.iloc[:, 61:71],
    protein.iloc[:, 71:78],
]
# The categorical label columns (hues used later: Genotype, Treatment,
# Behavior, class).
proteinClass = protein.iloc[:, 78:82]

# Attributes chosen for analysis, together with the class labels.
sub_protein = pd.concat([proteinData[0], proteinClass], axis=1)
sub_protein.describe(percentiles=[])

# Columns that contain at least one missing value.
na_cols = sub_protein.columns[sub_protein.isna().any()].tolist()
print(na_cols)
## Strategy 1: drop every row that contains any missing value.
proteinClean = sub_protein.dropna()

## Strategy 2: fill each missing value with the mean of its class group.
fill_protein = sub_protein.copy()
for n in na_cols:
    # BUG FIX: groupby(...).apply(lambda x: x.fillna(...)) can return a
    # group-keyed (MultiIndex) result on newer pandas, silently misaligning
    # the assignment.  transform() always preserves the original row index,
    # which is the documented way to do a per-group fill.
    fill_protein[n] = fill_protein.groupby(['class'], sort=False)[n].transform(lambda x: x.fillna(x.mean()))
# Re-check which columns still have missing values after each treatment.
## The untreated original data still reports its NA columns.
na_cols = sub_protein.columns[sub_protein.isna().any()].tolist()
print(na_cols)
## fill_protein was mean-imputed per class, so this prints an empty list.
na_cols = fill_protein.columns[fill_protein.isna().any()].tolist()
print(na_cols)

sub_protein.describe(percentiles=[], include='all')
# Names of the first five columns of the filled frame.
print(list(fill_protein.iloc[:, 0:5]))
# Visual exploration of the imputed data.
fill_protein.iloc[:,0:5].plot()

# Box plot of DYRK1A_N split by class.
fig, ax = plt.subplots(figsize=(8, 6))
plt.suptitle('')
fill_protein.boxplot(column=['DYRK1A_N'], by='class', ax=ax)

# Histograms of the first four proteins, and DYRK1A_N per genotype.
fill_protein.iloc[:,0:4].hist(figsize=(8,6))
fill_protein.groupby('Genotype').hist(column=['DYRK1A_N'])

# Pairwise scatter of all columns, coloured by class, plus a stacked histogram.
sns.pairplot(fill_protein, hue='class')
fill_protein.plot(kind='hist')

# NR1_N vs pNR1_N, viewed under each categorical labelling.
Relation = pd.concat([fill_protein['NR1_N'], fill_protein['pNR1_N']], axis=1)
Relation = pd.concat([Relation, proteinClass], axis=1)
for hue_col, pal in [('class', None), ('Genotype', None),
                     ('Treatment', "husl"), ('Behavior', "Set2")]:
    if pal is None:
        sns.pairplot(Relation, hue=hue_col)
    else:
        sns.pairplot(Relation, hue=hue_col, palette=pal)

# DYRK1A_N vs ITSN1_N, coloured by class.
Relation2 = pd.concat([fill_protein['DYRK1A_N'], fill_protein['ITSN1_N']], axis=1)
Relation2 = pd.concat([Relation2, proteinClass], axis=1)
sns.pairplot(Relation2, hue='class')
人眼對於二維圖形的大小比例關係並無法判斷得很精準,也許判斷相對大小沒問題,但判斷絕對大小卻很困難。
Google "pie chart" 其實也可以看到很多相關的爭論。
# Summary statistics plus an explicit quartile / inter-quartile-range table.
fill_protein.describe()

Q1 = fill_protein.quantile(q=0.25)
Q3 = fill_protein.quantile(q=0.75)
IQR = Q3 - Q1
row_name = ['Q3', 'Q1', 'IQR']
# Stack the three series as rows of a single table for display.
IQR_DF = pd.DataFrame([Q3, Q1, IQR], row_name)
IQR_DF.assign()  # argument-less assign() is a no-op copy; notebook display trick

# Five smallest rows by NR1_N, then by (DYRK1A_N, ITSN1_N).
fill_protein.iloc[:, 0:5].sort_values(by=['NR1_N']).head()
fill_protein.iloc[:, 0:5].sort_values(by=['DYRK1A_N', 'ITSN1_N']).head()

# Pairwise correlations of the first ten proteins, colour-shaded for reading.
pcorr = fill_protein.iloc[:, 0:10].corr()
pcorr.assign()
# NOTE(review): Styler.set_precision was removed in pandas 2.0 — newer code
# should use .style.background_gradient().format(precision=2).
pcorr.style.background_gradient().set_precision(2)
# Heatmap of the protein correlation matrix.
f, ax = plt.subplots(figsize=(10, 8))
# BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin bool is the supported dtype.  An all-False mask hides nothing,
# so the full matrix is drawn.
sns.heatmap(pcorr, mask=np.zeros_like(pcorr, dtype=bool), cmap="Blues",
            square=True, ax=ax)

# DYRK1A_N box plots grouped by class, then by genotype.
fig, ax = plt.subplots(figsize=(8, 6))
plt.suptitle('')
fill_protein.boxplot(column=['DYRK1A_N'], by='class', ax=ax)
fig, ax = plt.subplots(figsize=(8, 6))
plt.suptitle('')
fill_protein.boxplot(column=['DYRK1A_N'], by='Genotype', ax=ax)
sns.set(style="white", palette="muted", color_codes=True)
# 2x2 grid of smoothed histograms for four proteins.
# NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14);
# migrate to histplot/displot when upgrading.
f, axes = plt.subplots(2, 2, figsize=(9,9))
panels = [('DYRK1A_N', 'b'), ('ITSN1_N', 'r'), ('BDNF_N', 'g'), ('NR1_N', 'm')]
for panel_ax, (col_name, colour) in zip(axes.flat, panels):
    sns.distplot(fill_protein[col_name], color=colour, ax=panel_ax)
plt.show()
Relation.describe(percentiles=[])

# Regress pNR1_N (y) on NR1_N (X); sklearn expects 2-D arrays.
X, y = Relation.iloc[:,0].values.reshape(-1,1), Relation.iloc[:,1].values.reshape(-1,1)

from sklearn.metrics import mean_squared_error

# Result table: one row per model, columns = test MSE and train "correlation"
# (sklearn's .score, i.e. R^2 on the training set).
row = ['linear', 'knn', 'ridge', 'linear^3', 'linear^4+intercept']
col = ['MSE', 'Cor']
regResult= pd.DataFrame(index=row, columns=col)

from sklearn.model_selection import train_test_split
# NOTE(review): no random_state, so the split (and every score below)
# changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.linear_model import LinearRegression
# Train linear model on the training set.
reg = LinearRegression().fit(X_train, y_train)
# Make predictions using the testing set.
y_pred_lm = reg.predict(X_test)
# The coefficients
print('Coefficients: \n', reg.coef_)
print('Intercept: \n', reg.intercept_)
print('linear model Correlation: \n', reg.score(X_train, y_train))

# Plot outputs
plt.scatter(X_test, y_test, color='black', label='test data')
plt.plot(X_test, y_pred_lm, color='blue', linewidth=3, label='linear model prediction')
regResult.iloc[0, 0] = mean_squared_error(y_test, y_pred_lm)
# BUG FIX: this line scored `model1`, which is not defined until much later
# in the script (NameError); the linear model trained here is `reg`.
regResult.iloc[0, 1] = reg.score(X_train, y_train)
plt.legend()
plt.show()
並印出 kNN model 的 correlation。
可以看到右上角有一個比較遠的點,像是離群值;當 test data 挑到那個點的時候,會導致 kNN model 的預測曲線有點歪掉。
from sklearn import neighbors
# Fit kNN regressors (uniform vs distance weighting) on the first split and
# plot their predictions over a dense, sorted grid of the test range.
k = 5
T = np.linspace(X_test.min(), X_test.max(), 108)[:, np.newaxis]
plt.figure(figsize=(12,12))
for i, weights in enumerate(['uniform', 'distance']):
    # BUG FIX: n_neighbors must be passed by keyword — scikit-learn >= 1.0
    # rejects positional estimator parameters with a TypeError.
    knn = neighbors.KNeighborsRegressor(n_neighbors=k, weights=weights)
    y_pred_knn = knn.fit(X_train, y_train).predict(T)
    plt.subplot(2, 1, i + 1)
    plt.scatter(X_test, y_test, c='k', label='test data')
    plt.plot(T, y_pred_knn, c='g', label='knn prediction', linewidth=4)
    plt.plot(X_test, y_pred_lm, color='blue', linewidth=4, label='linear model prediction')
    plt.ylim(y_test.min()-y_test.min()/5, y_test.max()+y_test.max()/5)
    plt.legend()
    plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (k,
                                                                weights))
    # Training-set R^2 of the kNN model.
    print('kNN Correlation: \n', knn.score(X_train, y_train))
plt.tight_layout()
plt.show()
# Second train/test split of the same data; kNN predicted directly on the
# (unsorted) X_test2.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2)

from sklearn import neighbors
# Fit regression model
k = 5
T = np.linspace(X_test2.min(), X_test2.max(), 216)[:, np.newaxis]  # unused here; re-made below
plt.figure(figsize=(12,12))
for i, weights in enumerate(['uniform', 'distance']):
    # n_neighbors passed by keyword (required on scikit-learn >= 1.0).
    knn = neighbors.KNeighborsRegressor(n_neighbors=k, weights=weights)
    y_pred_knn = knn.fit(X_train2, y_train2).predict(X_test2)
    plt.subplot(2, 1, i + 1)
    plt.scatter(X_test2, y_test2, c='k', label='data')
    # X_test2 is unsorted, so this "line" zig-zags point to point.
    plt.plot(X_test2, y_pred_knn, c='g', label='prediction', linewidth=3)
    plt.plot(X_test, y_pred_lm, color='blue', linewidth=3)
    plt.ylim(y_test2.min()-y_test2.min()/5, y_test2.max()+y_test2.max()/5)
    plt.legend()
    plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (k,
                                                                weights))
    # BUG FIX: the original scored the FIRST split's X_train/y_train, but
    # this model was fitted on X_train2/y_train2.
    print('Correlation: \n', knn.score(X_train2, y_train2))
plt.tight_layout()
plt.show()
from sklearn import neighbors
# Same comparison, but with X_test2 sorted so the kNN curve is readable.
k = 5
T = np.linspace(X_test2.min(), X_test2.max(), 108)[:, np.newaxis]
X_test2_sorted = np.sort(X_test2.reshape(1,-1)).reshape(-1,1)
plt.figure(figsize=(12,12))
for i, weights in enumerate(['uniform', 'distance']):
    # n_neighbors passed by keyword (required on scikit-learn >= 1.0).
    knn = neighbors.KNeighborsRegressor(n_neighbors=k, weights=weights)
    knn.fit(X_train2, y_train2)
    # Sorted X only for the plotted curve.
    y_pred_knn = knn.predict(X_test2_sorted)
    plt.subplot(2, 1, i + 1)
    plt.scatter(X_test2, y_test2, c='k', label='data')
    plt.plot(X_test2_sorted, y_pred_knn, c='g', label='prediction', linewidth=3)
    plt.plot(X_test, y_pred_lm, color='blue', linewidth=3)
    plt.ylim(y_test2.min()-y_test2.min()/5, y_test2.max()+y_test2.max()/5)
    plt.legend()
    plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (k,
                                                                weights))
    # BUG FIX: scored the first split's X_train; this model uses X_train2.
    print('Correlation: \n', knn.score(X_train2, y_train2))
# BUG FIX: the MSE must compare y_test2 with predictions in the SAME row
# order; predicting on the sorted X misaligned the rows, so predict on the
# unsorted X_test2 here (last fitted model, weights='distance').
regResult.iloc[1, 0] = mean_squared_error(y_test2, knn.predict(X_test2))
# BUG FIX: `model1` is undefined at this point; score the fitted kNN model.
regResult.iloc[1, 1] = knn.score(X_train2, y_train2)
plt.tight_layout()
plt.show()
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Polynomial ridge regression, degrees 1-3, on the second split.
colors = ['red', 'orange', 'teal', 'yellowgreen', 'gold']
lw = 2
plt.scatter(X_test2, y_test2, color='navy', s=30, marker='o', label="training points")
X_test2_sorted = np.sort(X_test2.reshape(1,-1)).reshape(-1,1)
for count, degree in enumerate([1, 2, 3]):
    model = make_pipeline(PolynomialFeatures(degree), Ridge())
    model.fit(X_train2, y_train2)
    # Sorted X only so the curve plots smoothly.
    y_polyRidge = model.predict(X_test2_sorted)
    print('Correlation%d: \n' % degree, model.score(X_train2, y_train2))
    plt.plot(X_test2_sorted, y_polyRidge, color=colors[count], linewidth=lw,
             label="degree %d" % (degree))
# BUG FIX: the MSE must align predictions with y_test2's row order, so
# predict on the unsorted X_test2 (the loop leaves `model` = degree-3 fit).
regResult.iloc[2, 0] = mean_squared_error(y_test2, model.predict(X_test2))
# BUG FIX: `model1` is undefined here; score the ridge pipeline just fitted.
regResult.iloc[2, 1] = model.score(X_train2, y_train2)
plt.legend()
plt.show()
用淺綠細線表示 model2 ( $\hat{y}_{m2} = w_0+ w_1X + w_2X^2 + w_3X^3 + w_4X^4$ )。
雖然可以決定 model 要不要考慮 intercept 項,但因這兩個變數相關性太高,導致不管有沒有考慮,train 出來的 model 都長得差不多。
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np

# Sorted copy of X_test2, used only for plotting smooth curves.
X_test2_sorted = np.sort(X_test2.reshape(1,-1)).reshape(-1,1)

# model1: degree-3 polynomial fit WITHOUT an intercept term.
model1 = Pipeline([('poly', PolynomialFeatures(degree=3)),
                   ('linear', LinearRegression(fit_intercept=False))])
model1 = model1.fit(X_train2, y_train2)
# The coefficients
print('Coefficients: \n', model1.named_steps['linear'].coef_)
print('Intercept: \n', model1.named_steps['linear'].intercept_)
print('Correlation: \n', model1.score(X_train2, y_train2))
y1 = model1.predict(X_test2_sorted)
# BUG FIX: MSE must compare predictions in y_test2's own row order, so
# predict on the unsorted X_test2 rather than the sorted copy.
regResult.iloc[3, 0] = mean_squared_error(y_test2, model1.predict(X_test2))
regResult.iloc[3, 1] = model1.score(X_train2, y_train2)

# model2: degree-4 polynomial fit WITH an intercept term.
model2 = Pipeline([('poly', PolynomialFeatures(degree=4)),
                   ('linear', LinearRegression(fit_intercept=True))])
model2 = model2.fit(X_train2, y_train2)
# The coefficients
print('Coefficients: \n', model2.named_steps['linear'].coef_)
print('Intercept: \n', model2.named_steps['linear'].intercept_)
print('Correlation: \n', model2.score(X_train2, y_train2))
y2 = model2.predict(X_test2_sorted)
regResult.iloc[4, 0] = mean_squared_error(y_test2, model2.predict(X_test2))
regResult.iloc[4, 1] = model2.score(X_train2, y_train2)

plt.figure(figsize=(8, 6))
plt.scatter(X_test2, y_test2, color='navy', s=30, marker='o', label="training points")
# BUG FIX: the legend labels were swapped — y1 comes from model1
# (fit_intercept=False) and y2 from model2 (fit_intercept=True).
plt.plot(X_test2_sorted, y1, c='teal', label='fit_intercept=False', linewidth=5)
plt.plot(X_test2_sorted, y2, c='yellowgreen', label='fit_intercept=True')
plt.legend()
plt.show()

# Display the collected MSE / correlation table (argument-less assign() is
# a no-op copy used as a notebook display trick; the duplicate call was
# removed).
regResult.assign()